Loading Data
The following libraries will be used.
library(data.table) # database-like tables
library(ggplot2) # general plots
library(highcharter) # interactive plots
library(DT) # display tables
library(corrplot) # corr SPLOMs
library(vcd) # stats for categories
library(mice) # for imputation
Input is loaded and covnerted to data.table.
test <- read.csv("../input/test.csv")
train <- read.csv("../input/train.csv")
test <- data.table(test)
train <- data.table(train)
The number of dimensions and density of the input matrix is calculated. The datasets are combined.
(N <- nrow(train))
## [1] 1460
(p <- ncol(train) - 2)
## [1] 79
N^(1/p)
## [1] 1.096617
test$SalePrice <- NaN
full <- rbind(train, test)
From the data_description variable types can be defined. There are 4 categories: nominal, ordinal, discrete, continuous. A lot of the discrete variables could arguably be handled as ordinals.
nomVars <- c("MSZoning", "Street", "Alley", "LandContour", "LotConfig",
"Neighborhood", "Condition1", "Condition2", "HouseStyle",
"RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
"MasVnrType", "Foundation", "Heating", "CentralAir", "Electrical",
"GarageType", "PavedDrive", "MiscFeature", "SaleType",
"SaleCondition")
ordVars <- c("MSSubClass", "LotShape", "Utilities", "LandSlope", "BldgType",
"OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual",
"BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
"HeatingQC", "KitchenQual", "Functional", "FireplaceQu",
"GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence",
"MoSold")
discVars <- c("YearBuilt", "YearRemodAdd", "BsmtFullBath", "BsmtHalfBath",
"FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
"TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars",
"YrSold")
contVars <- c("LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
"TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF",
"GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF",
"EnclosedPorch", "X3SsnPorch", "ScreenPorch", "PoolArea",
"MiscVal", "LotFrontage")
A table is created to store feature attributes.
featAttr <- data.table(
name = colnames(test)[-1],
type = "nominal"
)
featAttr[name %in% ordVars, type := "ordinal"]
featAttr[name %in% discVars, type := "discrete"]
featAttr[name %in% contVars, type := "continuous"]
Special Values
According to data_description several \(NA\) have a special meaning. These values are replaced with more descriptive chr strings. Before, test and train set are combined.
full$label <- rep(c("train", "test"), c(nrow(test), nrow(train)))
full[is.na(Alley), Alley := "noAccess"]
full[is.na(BsmtQual), BsmtQual := "noBasement"]
full[is.na(BsmtCond), BsmtCond := "noBasement"]
full[is.na(BsmtExposure), BsmtExposure := "noBasement"]
full[is.na(BsmtFinType1), BsmtFinType1 := "noBasement"]
full[is.na(BsmtFinType2), BsmtFinType2 := "noBasement"]
full[is.na(FireplaceQu), FireplaceQu := "noFireplace"]
full[is.na(GarageType), GarageType := "noGarage"]
full[is.na(GarageFinish), GarageFinish := "noGarage"]
full[is.na(GarageQual), GarageQual := "noGarage"]
full[is.na(GarageCond), GarageCond := "noGarage"]
full[is.na(PoolQC) , PoolQC := "noPool"]
full[is.na(Fence), Fence := "noFence"]
full[is.na(MiscFeature), MiscFeature := "none"]
Nominal, ordinal, and discrete variables are converted to factor, continuous to numeric.
for (j in c(nomVars, ordVars, discVars)) full[[j]] <- as.factor(full[[j]])
for (j in contVars) full[[j]] <- as.numeric(full[[j]])
Ordering
From data_description the order of ordinal variable classes can be derived. These orderings are defined, then the levels of their factors are adjusted.
lvlOrd <- list(
LotShape = c("Reg", "IR1", "IR2", "IR3"),
Utilities = c("AllPub", "NoSewr", "NoSeWa", "ELO"),
LandSlope = c("Gtl", "Mod", "Sev"),
BldgType = c("1Fam", "2FmCon", "Duplx", "TwnhsE", "TwnhsI"),
OverallQual = 10:1,
OverallCond = 10:1,
ExterQual = c("Ex", "Gd", "TA", "Fa", "Po"),
ExterCond = c("Ex", "Gd", "TA", "Fa", "Po"),
BsmtQual = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
BsmtCond = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
BsmtExposure = c("Gd", "Av", "Mn", "No", "noBasement"),
BsmtFinType1 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
BsmtFinType2 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
HeatingQC = c("Ex", "Gd", "TA", "Fa", "Po"),
KitchenQual = c("Ex", "Gd", "TA", "Fa", "Po"),
Functional = c("Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"),
FireplaceQu = c("Ex", "Gd", "TA", "Fa", "Po", "noFireplace"),
GarageFinish = c("Fin", "RFn", "Unf", "noGarage"),
GarageQual = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
GarageCond = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
PoolQC = c("Ex", "Gd", "TA", "Fa", "noPool"),
Fence = c("GdPrv", "MnPrv", "GdWo", "MnWw", "noFence"),
MoSold = 1:12
)
for (j in names(lvlOrd)) {
full[[j]] <- factor(full[[j]], levels = lvlOrd[[j]])
}
Some features’ values heavily depend on the value of another feature. For example the values of \(BsmtUnfSF\), \(TotalBsmtSF\) only make sense if features \(BsmtCond\), \(BsmtExposure\), \(BsmtQual\) are not \(noBasement\). This information is given in data_description.
\[ \begin{aligned} BsmtUnfSF, TotalBsmtSF : BsmtCond, BsmtExposure, BsmtQual \ne noBasement\\ BsmtFinSF1 : BsmtFinType1 \ne noBasement\\ BsmtFinSF2 : BsmtFinType2 \ne noBasement \\ GarageYrBlt, GarageCars, GarageArea: GarageType, GarageFinish, GarageQual, GarageCond \ne noGarage \\ PoolArea : PoolQC \ne noPool \\ MiscVal : MiscFeature \ne none \\ MasVnrArea : MasVnrType \ne none \end{aligned} \] For the garage and basement features there are several features which indicate the absence of that feature. If one of them indicates the absence of a feature, the others should too.
setdiff(which(full$BsmtCond == "noBasement"),
which(full$BsmtExposure == "noBasement"))
## [1] 2041 2186 2525
setdiff(which(full$BsmtCond == "noBasement"),
which(full$BsmtQual == "noBasement"))
## [1] 2041 2186 2525
full[c(2041, 2186, 2525), .(Id, BsmtCond, BsmtExposure, BsmtQual,
BsmtUnfSF, TotalBsmtSF)]
## Id BsmtCond BsmtExposure BsmtQual BsmtUnfSF TotalBsmtSF
## 1: 2041 noBasement Mn Gd 0 1426
## 2: 2186 noBasement No TA 94 1127
## 3: 2525 noBasement Av TA 240 995
So here the data_description is not consistent. The general condition is set to noBasement but there are values for basement exposure, quality, and so on.
setdiff(which(full$GarageType == "noGarage"),
which(full$GarageFinish == "noGarage"))
## integer(0)
setdiff(which(full$GarageType == "noGarage"),
which(full$GarageQual == "noGarage"))
## integer(0)
setdiff(which(full$GarageType == "noGarage"),
which(full$GarageCond == "noGarage"))
## integer(0)
The garage features doesn’t have these inconsistencies.
Mark Undefined Variables
If e.g. PoolQC is noPool, then there should be a missing vlaue or a zero for PoolArea. If there actually is a value, this could either mean it is a mistake or it means something special that is not clearly described. To at least be consistent, all NAs will be set to 0.
full[BsmtCond == "noBasement" & is.na(BsmtUnfSF), BsmtUnfSF := 0]
full[BsmtCond == "noBasement" & is.na(TotalBsmtSF), TotalBsmtSF := 0]
full[BsmtFinType1 == "noBasement" & is.na(BsmtFinSF1), BsmtFinSF1 := 0]
full[BsmtFinType2 == "noBasement" & is.na(BsmtFinSF2), BsmtFinSF2 := 0]
full[GarageType == "noGarage" & is.na(GarageYrBlt), GarageYrBlt := "0"]
full[GarageType == "noGarage" & is.na(GarageCars), GarageCars := "0"]
full[GarageType == "noGarage" & is.na(GarageArea), GarageArea := 0]
full[PoolQC == "noPool" & is.na(PoolArea), PoolArea := 0]
full[MiscFeature == "noFeature" & is.na(MiscVal), MiscVal := 0]
\(NA\) which are left must be true missing data.
naFeats <- apply(full[, !"SalePrice"], 2, function(x) any(is.na(x)))
naObs <- apply(full[, !"SalePrice"], 1, function(x) any(is.na(x)))
Features
The total number of missing values per feature will be saved. The features with at least one missing value are shown below.
total <- apply(full[, naFeats, with = FALSE], 2, function(j) sum(is.na(j)))
df <- data.frame(
missing = total / nrow(full),
name = factor(names(total), levels = names(total)[order(total)]),
type = featAttr[match(names(total), featAttr$name), type]
)
ggplot(df, aes(x = name, y = missing, fill = type)) +
geom_bar(stat = "identity") +
scale_y_continuous("proportion missing") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Features with Missing Values")
Observations
Below the number of missing values for observations that have missing values is shown. Many observations have 1 or 2, some have 3 or 4 missing values.
total <- apply(full[naObs, ], 1, function(i) sum(is.na(i)))
df <- data.frame(
y = total,
name = full[naObs, Id]
)
highchart() %>%
hc_chart(type = "column") %>%
hc_title(text = "Observations with Missing Values") %>%
hc_xAxis(title = list(text = "observations"), categories = df$name,
labels = list(rotation = "-45")) %>%
hc_yAxis(title = list(text = "missing"), align = "left") %>%
hc_add_series(showInLegend = FALSE, data = list_parse(df[order(df$y), ]))
Feature distributions are described for each type of variable separately.
Values were log10 transformed to better capture their spread. There are many zeros, partly because of the dependency to another variable. These zeros are excluded during the transformation.
dt <- melt(full, "Id", contVars)
ggplot(dt, aes(x = value)) +
geom_histogram(bins = 30) +
scale_x_continuous(trans = "log10") +
facet_wrap(~ variable, scale = "free") +
theme_bw()
Summaries for each distribution are computed. There are many undefined variables which are set to zero. This influences the description a lot. Therefore, they are excluded.
dt <- full[, contVars, with = FALSE]
df <- data.frame(
Min = apply(dt, 2, function(x) min(x[x > 0], na.rm = TRUE)),
Mean = apply(dt, 2, function(x) mean(x[x > 0], na.rm = TRUE)),
Median = apply(dt, 2, function(x) median(x[x > 0], na.rm = TRUE)),
Max = apply(dt, 2, function(x) max(x[x > 0], na.rm = TRUE))
)
Relative difference between mean and median (relDelta) are computed as an indicator for skewness. Also, relative cardinality and relative number of outliers are computed. Outliers are defined as being differing at least \(+/- 1.5 MAD\) from the median.
df$relDelta <- abs(df$Mean - df$Median) / df$Mean
df$relCard <- apply(dt, 2, function(x) {
length(unique(x[x > 0])) / length(x[x > 0])
})
df$relOut <- apply(dt, 2, function(x) {
length(boxplot.stats(x[x > 0])$out) / length(x[x > 0])
})
PoolArea and MiscVal show a high proportion of outliers, but this might be because there are not many total values for these variables. Many supposably continuous variables have a quite low cardinality.
DT::datatable(df, options = list(dom = "t", pageLength = 40)) %>%
formatStyle('relDelta',
color = styleInterval(c(0.2, 0.8), c('green', 'orange', 'red'))) %>%
formatStyle('relCard',
color = styleInterval(c(0.2, 0.6), c('red', 'orange', 'green'))) %>%
formatStyle('relOut',
color = styleInterval(c(0.05, 0.1), c('green', 'orange', 'red'))) %>%
formatRound(c(2, 5:7), 3)
Distributions are shown below.
dt <- melt(full, "Id", discVars)
ggplot(dt, aes(x = value)) +
geom_bar() +
facet_wrap(~ variable, scales = "free") +
theme_bw()
As summary, the value range, and cardinality are computed.
dt <- full[, discVars, with = FALSE]
df <- data.frame(
Card = apply(dt, 2, function(x) length(unique(x[!is.na(x)]))),
Min = apply(dt, 2, function(x) min(as.numeric(as.character(x)), na.rm = TRUE)),
Max = apply(dt, 2, function(x) max(as.numeric(as.character(x)), na.rm = TRUE))
)
Apart from that, the modes are usually interesting. For this a function has to be defined first.
Mode <- function(x, fst = TRUE) {
ux <- unique(x)
tab <- tabulate(match(x, ux))
modeIdx <- which.max(tab)
if (fst) return(list(value = ux[modeIdx], freq = tab[modeIdx]))
tab[modeIdx] <- 0
modeIdx <- which.max(tab)
return(list(value = ux[modeIdx], freq = tab[modeIdx]))
}
First and second modes are computed and the table is shown. Modes that make up more than 50% of observations will be highlighted.
df$Mode <- apply(dt, 2, function(x) Mode(x)$value)
df$ModeCount <- apply(dt, 2, function(x) Mode(x)$freq)
df$Mode2nd <- apply(dt, 2, function(x) Mode(x, fst = FALSE)$value)
df$ModeCount2nd <- apply(dt, 2, function(x) Mode(x, fst = FALSE)$freq)
DT::datatable(df, options = list(dom = "t", pageLength = 40)) %>%
formatStyle('ModeCount',
color = styleInterval(nrow(dt) / 2, c("black", 'orange')))
In GarageYrBlt the maximum value is 2207, which does not make sense. It seems that this is a typo and the real value is 2007. After checking that there are not more values above 2010 2207 is changed to 2007.
full[GarageYrBlt == 2207, GarageYrBlt := "2007"]
Ordinal Variables
dt <- melt(full, "Id", ordVars)
ggplot(dt, aes(x = value)) +
geom_bar() +
facet_wrap(~ variable, scales = "free") +
theme_bw()
Nominal Variables
dt <- melt(full, "Id", nomVars)
ggplot(dt, aes(x = value)) +
geom_bar() +
facet_wrap(~ variable, scales = "free") +
theme_bw()
Summary
Cardinality, 1st and 2nd modes are given as summary. Modes that make up more than 50% of observations will be highlighted.
dt <- full[, c(ordVars, nomVars), with = FALSE]
df <- data.frame(
Card = apply(dt, 2, function(x) length(unique(x[!is.na(x)]))),
Mode = apply(dt, 2, function(x) Mode(x)$value),
ModeCount = apply(dt, 2, function(x) Mode(x)$freq),
Mode2nd = apply(dt, 2, function(x) Mode(x, fst = FALSE)$value),
ModeCount2nd = apply(dt, 2, function(x) Mode(x, fst = FALSE)$freq),
Type = rep(c("ord", "nom"), c(length(ordVars), length(nomVars)))
)
DT::datatable(df, options = list(dom = "t", pageLength = 40)) %>%
formatStyle('Type',
color = styleEqual(unique(df$Type), c('orange', 'blue'))) %>%
formatStyle('ModeCount',
color = styleInterval(nrow(dt) / 2, c("black", 'orange')))
vars <- c(contVars, discVars)
The Pearson Correlation Coefficient is calculated for all feature pairs where both features are continuous. The results are shown as heatmap below. Using euclidean distance of the correlation matrix as a metric rows and columns were hierarchically ordered (complete link).
m <- cor(
full[, contVars, with = FALSE],
use = "complete.obs"
)
corrplot(m, method = "color", order = "hclust")
Some clusters are visible. The same hierarchical clustering is shown as tree below.
corrClust <- hclust(dist(m))
plot(corrClust)
For categorical features Cramer’s V is calculated for all feature pairs to indicate correlation.
getCramerV <- function(m, j) {
vapply(seq_len(ncol(m)), function(j) {
assocstats(table(m[, i], m[, j]))$cramer
}, numeric(1))
}
Here, discrete variables are included.
vars <- c(discVars, ordVars, nomVars)
The results are shown as heatmap below. Using euclidean distance of the correlation matrix as a metric rows and columns were hierarchically ordered (complete link).
m <- as.matrix(full[, vars, with = FALSE])
m <- m[complete.cases(m), ]
mCramer <- matrix(NA, ncol(m), ncol(m))
for (i in seq_len(ncol(m))) {
mCramer[i, ] <- vapply(
seq_len(ncol(m)),
function(j) assocstats(table(m[, i], m[, j]))$cramer,
numeric(1)
)
}
The hierarchical clustered correlation plot is shown below.
colnames(mCramer) <- colnames(m)
rownames(mCramer) <- colnames(m)
corrplot(mCramer, method = "color", order = "hclust")
The tree for the clustering:
corrClust <- hclust(dist(mCramer))
plot(corrClust)
Featrues with missing values are.
# featAttr[missing > 0, ]
Imputation methods are defined.
# featAttr$impMethod <- ""
Continuous variables should be imputed using predictive mean matching, binary variables using logistic regression, other discrete and ordinal variables using proportional odds, others using polytomous regression.
# featAttr[missing > 0, impMethod := "polyreg"]
# featAttr[missing > 0 & type == "ordinal", impMethod := "polr"]
# featAttr[missing > 0 & type == "discrete", impMethod := "polr"]
# featAttr[missing > 0 & card == 2, impMethod := "logreg"]
# featAttr[missing > 0 & type == "continuous", impMethod := "pmm"]
# imp <- mice(full, method = c("", featAttr$impMethod, ""))
# imp <- mice(full)
# imp <- mice(full[, .(Utilities)])
# str(full)
# unique(full$Utilities)
#
# imp <- mice(full, seed = 42)
# ?mice
#